// Copyright 2024 The Google Research Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.



syntax = "proto2";

package research_scann;

import "scann/proto/distance_measure.proto";
import "scann/proto/exact_reordering.proto";
import "scann/proto/input_output.proto";
import "scann/proto/projection.proto";

// Configuration for a partitioner.
//
// Only names, types, field numbers and defaults are visible in this file.
// Comments marked NOTE(review) are assumptions that should be confirmed
// against the code that consumes this message.
message PartitioningConfig {
  // Kind of partitioner/tree selected by `tree_type`.
  enum TreeType {
    KMEANS_TREE = 0;

    PCA_TREE = 1;

    RANDOM_PROJECTION_TREE = 2;

    BALL_TREE = 3;

    RANDOM = 4;

    TREE_X_HYBRID = 5;
  }

  // Selected tree type; KMEANS_TREE when unset.
  optional TreeType tree_type = 31 [default = KMEANS_TREE];

  // Extra settings carried in a TreeXHybridPartitioningConfig.
  // NOTE(review): presumably only consulted when tree_type is TREE_X_HYBRID.
  optional TreeXHybridPartitioningConfig tree_x_hybrid = 46;

  // At most one of the two members may be set (oneof semantics).
  // NOTE(review): presumably both describe how much data is sampled for
  // training, as a fraction or as an absolute count -- confirm.
  oneof SamplingFractionOrExpectedSize {
    // Defaults to 1.0.
    float partitioning_sampling_fraction = 4 [default = 1.0];

    // No explicit default.
    int32 expected_sample_size = 45;
  }

  // NOTE(review): looks like a file/path prefix for the partitioner -- confirm.
  optional string partitioner_prefix = 8;

  // Defaults to 1.
  optional int32 max_num_levels = 2 [default = 1];

  // Defaults to 1.
  optional int32 max_leaf_size = 11 [default = 1];

  // Defaults to 0.
  optional int32 clustering_seed = 27 [default = 0];

  // Projection settings (type declared in scann/proto/projection.proto).
  optional ProjectionConfig projection = 12;

  // Defaults to 2.
  optional int32 num_children = 3 [default = 2];

  // Distance measure settings (type declared in
  // scann/proto/distance_measure.proto).
  optional DistanceMeasureConfig partitioning_distance = 10;

  // NOTE(review): by name, these override the distance measure used when
  // tokenizing the database and queries respectively -- confirm.
  optional DistanceMeasureConfig database_tokenization_distance_override = 24;

  optional DistanceMeasureConfig query_tokenization_distance_override = 25;

  // Representation choices used by the *_tokenization_type fields. Number 0
  // is reserved, so this enum has no zero value; FLOAT (1) is its first value.
  enum TokenizationType {
    FLOAT = 1;

    FIXED_POINT_INT8 = 2;

    ASYMMETRIC = 3;

    reserved 0;
  }

  // Defaults to FLOAT.
  optional TokenizationType query_tokenization_type = 28 [default = FLOAT];

  // Defaults to FLOAT.
  optional TokenizationType database_tokenization_type = 29 [default = FLOAT];

  // Defaults to 10.
  optional int32 max_clustering_iterations = 6 [default = 10];

  // Defaults to 1.
  optional int32 num_mini_batches = 38 [default = 1];

  // Defaults to 1e-5.
  optional float clustering_convergence_tolerance = 7 [default = 1e-5];

  // Defaults to 1. Note that this field is a float while max_cluster_size is
  // an int32.
  optional float min_cluster_size = 9 [default = 1];

  // No explicit default.
  optional int32 max_cluster_size = 40;

  // No explicit default.
  optional double perturbation = 41;

  // Partitioning flavor selected by `partitioning_type`.
  enum PartitioningType {
    GENERIC = 0;

    SPHERICAL = 1;
  }

  // Defaults to GENERIC.
  optional PartitioningType partitioning_type = 23 [default = GENERIC];

  // Balancing strategy selected by `balancing_type`.
  enum BalancingType {
    DEFAULT_UNBALANCED = 0;

    GREEDY_BALANCED = 1;

    UNBALANCED_FLOAT32 = 2;
  }

  // Defaults to DEFAULT_UNBALANCED, the value named as the default. This
  // matches the other enums in this message whose DEFAULT_* value is also the
  // field default (SingleMachineCenterInitializationType and
  // PartitionerTrainerType).
  optional BalancingType balancing_type = 35 [default = DEFAULT_UNBALANCED];

  // Center initialization method selected by
  // `single_machine_center_initialization`.
  enum SingleMachineCenterInitializationType {
    DEFAULT_KMEANS_PLUS_PLUS = 0;

    RANDOM_INITIALIZATION = 1;
  }

  // Defaults to DEFAULT_KMEANS_PLUS_PLUS.
  optional SingleMachineCenterInitializationType
      single_machine_center_initialization = 49
      [default = DEFAULT_KMEANS_PLUS_PLUS];

  // Database-side spilling settings; see DatabaseSpillingConfig.
  optional DatabaseSpillingConfig database_spilling = 20;

  // Query-side spilling settings; see QuerySpillingConfig.
  optional QuerySpillingConfig query_spilling = 21;

  // NOTE(review): looks like an output prefix for resharded data -- confirm.
  optional string resharded_prefix = 14;

  // Defaults to 1.
  optional int32 num_partitioning_epochs = 1 [default = 1];

  // Defaults to "PartitionReshard".
  // NOTE(review): this field and the resource fields below (cell, num_cpus,
  // num_mapper_machines, ram_gb, disk_gb) appear to configure a distributed
  // job -- confirm which pipelines still read them.
  optional string mr_jobname_prefix = 13 [default = "PartitionReshard"];

  optional string cell = 15;

  // Defaults to 1.
  optional int32 num_cpus = 19 [default = 1];

  // Defaults to -1.
  // NOTE(review): -1 is presumably a "not specified" sentinel -- confirm.
  optional int32 num_mapper_machines = 30 [default = -1];

  // Defaults to 4.
  optional int32 ram_gb = 16 [default = 4];

  // Defaults to 1.
  optional int32 disk_gb = 17 [default = 1];

  // Trainer implementation selected by `trainer_type`. Number 2 is not
  // assigned to any value.
  enum PartitionerTrainerType {
    DEFAULT_SAMPLING_TRAINER = 0;

    FLUME_KMEANS_TRAINER = 1;

    PCA_KMEANS_TRAINER = 3;

    SAMPLING_PCA_KMEANS_TRAINER = 4;
  }
  // Defaults to DEFAULT_SAMPLING_TRAINER.
  optional PartitionerTrainerType trainer_type = 36
      [default = DEFAULT_SAMPLING_TRAINER];

  // NOTE(review): looks like an output prefix for trainer statistics --
  // confirm.
  optional string trainer_stats_prefix = 37;

  // Defaults to 2.
  optional int32 max_power_of_2_split = 39 [default = 2];

  // Defaults to 20.
  optional int32 num_top_tokens_for_pca = 42 [default = 20];

  // Defaults to 1e-5.
  optional float pca_splitting_similarity_threshold = 43 [default = 1e-5];

  // Defaults to 3000.
  optional int32 desired_average_cluster_size = 34 [default = 3000];

  // Field number 44 and the field name "kmeans_flume_config" are reserved so
  // that neither can be reused.
  reserved 44;
  reserved "kmeans_flume_config";

  // Defaults to NaN.
  // NOTE(review): NaN presumably means "not configured" -- confirm.
  optional float avq = 51 [default = nan];

  // Incremental training settings; see IncrementalTrainingConfig.
  optional IncrementalTrainingConfig incremental_training_config = 52;

  // The four fields below are deprecated.

  // Deprecated. Defaults to false.
  optional bool use_float_centers_for_query_tokenization = 26
      [default = false, deprecated = true];

  // Deprecated. Defaults to false.
  optional bool partitioning_on_the_fly = 22
      [default = false, deprecated = true];

  // Deprecated. Defaults to false.
  optional bool use_flume_kmeans = 32 [default = false, deprecated = true];

  // Deprecated. Defaults to 2147483647, the largest int32 value.
  optional int32 max_sample_size = 33 [default = 2147483647, deprecated = true];

  // Field numbers 47 and 48 are reserved and must not be reused.
  reserved 47, 48;
}

// Settings for database-side spilling.
// NOTE(review): "spilling" presumably means assigning a datapoint to more than
// one partition -- confirm against the code that consumes this message.
message DatabaseSpillingConfig {
  // Available spilling strategies. Number 4 is exposed under two names.
  enum SpillingType {
    // Required because SOAR shares number 4 with
    // TWO_CENTER_ORTHOGONALITY_AMPLIFIED.
    option allow_alias = true;

    NO_SPILLING = 0;
    MULTIPLICATIVE = 1;
    ADDITIVE = 2;
    FIXED_NUMBER_OF_CENTERS = 3;
    TWO_CENTER_ORTHOGONALITY_AMPLIFIED = 4;
    SOAR = 4;  // Alias of TWO_CENTER_ORTHOGONALITY_AMPLIFIED.
  }

  // Strategy in effect; NO_SPILLING when unset.
  optional SpillingType spilling_type = 1 [default = NO_SPILLING];

  // No explicit default.
  optional float replication_factor = 2;

  // Defaults to 4294967295, the largest uint32 value.
  // NOTE(review): presumably this means "no limit" -- confirm.
  optional uint32 max_spill_centers = 3 [default = 4294967295];

  // Defaults to 1.5.
  // NOTE(review): by name, likely only relevant to the
  // TWO_CENTER_ORTHOGONALITY_AMPLIFIED / SOAR strategy -- confirm.
  optional float orthogonality_amplification_lambda = 4 [default = 1.5];

  // Defaults to 2.0.
  optional float overretrieve_factor = 5 [default = 2.0];
}

// Settings for query-side spilling.
// NOTE(review): presumably controls how many partitions a query is routed to
// -- confirm against the code that consumes this message.
message QuerySpillingConfig {
  // Available spilling strategies. The numbering differs from
  // DatabaseSpillingConfig.SpillingType: here FIXED_NUMBER_OF_CENTERS is 4
  // and 3 is ABSOLUTE_DISTANCE.
  enum SpillingType {
    NO_SPILLING = 0;
    MULTIPLICATIVE = 1;
    ADDITIVE = 2;
    ABSOLUTE_DISTANCE = 3;
    FIXED_NUMBER_OF_CENTERS = 4;
  }

  // Strategy in effect; NO_SPILLING when unset.
  optional SpillingType spilling_type = 1 [default = NO_SPILLING];

  // No explicit default.
  // NOTE(review): interpretation presumably depends on spilling_type --
  // confirm.
  optional float spilling_threshold = 2;

  // Defaults to 4294967295, the largest uint32 value.
  // NOTE(review): presumably this means "no limit" -- confirm.
  optional uint32 max_spill_centers = 3 [default = 4294967295];
}

// Settings for incremental training.
// NOTE(review): field semantics are not visible in this file; confirm the
// notes below against the code that consumes this message.
message IncrementalTrainingConfig {
  // At most one of the two members may be set (oneof semantics).
  // NOTE(review): given the oneof's name, presumably the threshold that
  // triggers reassignment, as a fraction or as an absolute count -- confirm.
  oneof reassignment_trigger {
    // Defaults to +infinity.
    float fraction = 1 [default = inf];

    // Defaults to 4294967295, the largest uint32 value.
    uint32 number_of_datapoints = 2 [default = 4294967295];
  }

  // Defaults to 200.
  optional uint32 cluster_stability_size = 3 [default = 200];

  // Defaults to false.
  optional bool autopilot = 4 [default = false];

  // Defaults to 4294967295, the largest uint32 value.
  optional uint32 max_split = 5 [default = 4294967295];
}

// Settings referenced from PartitioningConfig.tree_x_hybrid.
// NOTE(review): the "top partitioner" fields presumably configure the upper
// level of a two-level partitioner -- confirm against the consuming code.
message TreeXHybridPartitioningConfig {
  // No explicit default.
  optional uint32 top_partitioning_children = 1;

  // Defaults to 0.05.
  optional float top_partitioner_spilling_ratio = 2 [default = 0.05];

  // Tokenization types for the top partitioner, one each for queries, the
  // database and training. All three default to FLOAT.
  optional PartitioningConfig.TokenizationType
      top_partitioner_query_tokenization_type = 7 [default = FLOAT];

  optional PartitioningConfig.TokenizationType
      top_partitioner_database_tokenization_type = 8 [default = FLOAT];

  optional PartitioningConfig.TokenizationType
      top_partitioner_training_tokenization_type = 10 [default = FLOAT];

  // Defaults to false.
  optional bool top_partitioner_use_flume_trainer = 9 [default = false];

  // NOTE(review): looks like a directory path for assets -- confirm.
  optional string assets_dir = 3;

  // Reordering settings (type declared in scann/proto/exact_reordering.proto).
  optional ExactReordering query_tokenization_reordering = 6;

  // Deprecated.
  optional ExactReordering training_reordering = 4 [deprecated = true];

  // Deprecated.
  optional ExactReordering database_tokenization_reordering = 5
      [deprecated = true];
}

// An id together with a list of datapoint indices.
// NOTE(review): presumably the datapoints assigned to one token/partition --
// confirm against the code that consumes this message.
message TokenList {
  // Identifier for this list.
  optional uint64 id = 1;
  // Zero or more datapoint indices. Note that these are signed 64-bit while
  // `id` is unsigned.
  repeated int64 datapoint_index = 2;
}

// Bundles one input/output configuration with a list of partitioning
// configurations.
// NOTE(review): given the message name, each repeated entry presumably
// configures one level of a hierarchy -- confirm ordering semantics.
message HierarchicalPartitionerConfig {
  // Input/output settings (type declared in scann/proto/input_output.proto).
  optional InputOutputConfig input_output = 1;

  // Zero or more partitioning configurations.
  repeated PartitioningConfig partitioning = 2;
}
